Analysis of Pairwise Overlap Between HH Stages

Embryo measurements | Schematic representation of the measurements taken

1. Import and tidy the data


# Load libraries
library(tidyverse)
library(here)
library(ggridges)  # for ridge plot visualization
library(reshape2)  # to use melt()
library(patchwork) # to combine plots with wrap_plots()

#
## Import raw data
#
raw_embryo_length <-
  readr::read_delim(
    file = here::here("chick_elong_overlap/data_embryo_elongation/input/ChickElongQuant_input.csv"),
    delim = ",",
    na = "",
    escape_double = FALSE,
    trim_ws = TRUE, 
    locale = locale(decimal_mark = ".") # Important | To deal with the fact that in Portugal, sometimes, the decimals are commas.
  )

#
## Format the dataset
#

# Vector with the HH stages ordered 
hh_levels_order <- c("HH4","HH5","HH6","HH7","HH8","HH9","HH10")
nr_of_embryos <- length(unique(raw_embryo_length$Embryo))

# Wide format
clean_embryo_length <- 
  raw_embryo_length |>
  dplyr::select(-NumberMeasuredFrames, -CultureType) |>
  dplyr::filter(!is.na(HHStage)) |>
  dplyr::mutate(HHStage = paste0("HH", HHStage),
                HHStage = factor(HHStage, levels = hh_levels_order),
                Embryo = factor(Embryo, levels = 1:nr_of_embryos)) |>
  dplyr::relocate(c(Embryo, HHStage), .before = Time)


# Pivot the data to long format for plotting
clean_embryo_length_longer <-
  clean_embryo_length |>
  tidyr::pivot_longer(
    cols = !c(Embryo, HHStage, Time),
    names_to = "Measure",
    values_to = "Length_mm"
  )


# Measurements description (for plots)
measurements <- tibble::tibble(
  measure = c(
    "1.Total embryo length (mm)",
    "2.Total PS length (mm)",
    "3.Notochord length (mm)",
    "4.Posterior length (mm)",
    "5.PS length (mm)",
    "6.PSM length (mm)",
    "7.Anterior length (mm)",
    "8.Segmented length (mm)",
    "9.Head fold length (mm)"
  ),
  acronym = c(
    "C-pPL",
    "C-PS",
    "C-N",
    "N-pPL",
    "N-PS",
    "PSM",
    "C-Seg",
    "SEG",
    "C-HF"
  ),
  my_labs = c(
    "C-pPL | Total embryo length",
    "C-PS | Total Primitive Streak length",
    "C-N | Notochord length",
    "N-pPL | Posterior region length",
    "N-PS | Primitive Streak length",
    "PSM | Presomitic Mesoderm length",
    "C-Seg | Anterior region length",
    "SEG | Segmented region length",
    "C-HF | Head fold length"
  )
)

2. Overview of measurements taken


## Summary stats
summary(clean_embryo_length)
#>      Embryo    HHStage        Time            C-pPL            C-PS      
#>  19     : 30   HH4 :43   Min.   :-7.600   Min.   :1.630   Min.   :1.530  
#>  5      : 28   HH5 :99   1st Qu.: 3.362   1st Qu.:3.618   1st Qu.:3.160  
#>  3      : 25   HH6 :50   Median : 9.150   Median :4.715   Median :4.140  
#>  4      : 25   HH7 :47   Mean   : 9.188   Mean   :4.585   Mean   :4.024  
#>  8      : 25   HH8 :71   3rd Qu.:15.090   3rd Qu.:5.572   3rd Qu.:4.878  
#>  11     : 24   HH9 :57   Max.   :24.150   Max.   :7.050   Max.   :6.400  
#>  (Other):233   HH10:23                    NA's   :2       NA's   :4      
#>       C-N            N-pPL            N-PS            PSM        
#>  Min.   :0.080   Min.   :1.020   Min.   :0.870   Min.   :0.3000  
#>  1st Qu.:1.185   1st Qu.:2.110   1st Qu.:1.520   1st Qu.:0.6400  
#>  Median :2.380   Median :2.450   Median :1.890   Median :0.8100  
#>  Mean   :2.318   Mean   :2.518   Mean   :1.915   Mean   :0.8443  
#>  3rd Qu.:3.380   3rd Qu.:2.910   3rd Qu.:2.225   3rd Qu.:1.0200  
#>  Max.   :5.000   Max.   :3.860   Max.   :3.110   Max.   :1.5900  
#>  NA's   :43      NA's   :45      NA's   :47      NA's   :192     
#>      C-Seg            SEG              C-HF       
#>  Min.   :1.180   Min.   :0.0800   Min.   :0.0600  
#>  1st Qu.:1.817   1st Qu.:0.3600   1st Qu.:0.4150  
#>  Median :1.933   Median :0.6550   Median :0.8700  
#>  Mean   :1.944   Mean   :0.6816   Mean   :0.8743  
#>  3rd Qu.:2.070   3rd Qu.:1.0100   3rd Qu.:1.2650  
#>  Max.   :2.310   Max.   :1.7300   Max.   :1.9000  
#>  NA's   :214     NA's   :192      NA's   :155

## Look at the Number of embryos per HH stage
clean_embryo_length |>
  # Reverse the order of the stacks (from bottom to top)
  dplyr::mutate(HHStage = factor(HHStage, levels = rev(unique(HHStage)))) |>
  dplyr::count(Embryo, HHStage) |>
  ggplot(aes(x = Embryo, y = n, fill = HHStage)) + 
  geom_col() +
  ylab("Number of measures taken per HH Stage") +
  ggtitle("Number of Measures per HHStage in Each Embryo") +
  geom_text(aes(label = n), position = position_stack(vjust = 0.5), 
            color = "black", size = 3.5) +
  theme_minimal() 

3. Ridge plots | Time and Length measurements


## List to save ridge plots
ridge_plots_all <- list() 


## Ridge plots | Per Time
ridge_plots_all$Time <-
  clean_embryo_length_longer |>
  ggplot(mapping = aes(x = Time, y =HHStage, fill = HHStage)) +
  geom_density_ridges(
    alpha = 0.75,
    rel_min_height = 0.001,
    scale = 1,
    quantile_lines = TRUE,
    quantiles = 2,
    jittered_points = TRUE,
    position = "points_sina",
    point_alpha = 0.6,
    point_size = 0.5
  ) +
  theme_ridges(grid = TRUE, center_axis_labels = TRUE) +
  theme(legend.position = "none") +
  #xlim(c(0, 7.1)) +
  #scale_x_continuous(breaks = seq(0, 8, 1)) +
  xlab("Time (min)") +
  ylab('HH stage') +
  ggtitle("Embryo time distribution per HH stage")



## Ridge plots | Faceted per measure
ridge_plots_all$faceted_length <- 
  ggplot(clean_embryo_length_longer, aes(x = Length_mm, y = HHStage, fill = HHStage)) +
  ggridges::geom_density_ridges(
    alpha = 0.75,
    rel_min_height = 0.001,
    scale = 2,
    quantile_lines = TRUE,
    quantiles = 2,
    jittered_points = TRUE,
    position = "points_sina",
    point_alpha = 0.8,
    point_size = 0.5
  ) +
  theme_ridges(grid = TRUE, center_axis_labels = TRUE) +
  facet_wrap(~Measure, scales = "free_x") +
  theme(legend.position = "none") +
  labs(
    title = "Embryo Measurement Distributions per HH Stage",
    x = "Measurement Value",
    y = "HH Stage"
  ) 


## Ridge plots | Individual plots per length measure
for (my_measure in measurements$acronym) {
  plot_data <- clean_embryo_length_longer |>
    dplyr::filter(Measure == my_measure) |>
    drop_na(Length_mm)
  
  p <- ggplot(plot_data, aes(x = Length_mm, y = HHStage, fill = HHStage)) +
    geom_density_ridges(
      alpha = 0.75,
      rel_min_height = 0.001,
      scale = 2,
      quantile_lines = TRUE,
      quantiles = 2,
      jittered_points = TRUE,
      position = "points_sina",
      point_alpha = 0.8
    ) +
    scale_fill_manual(values = scales::hue_pal()(7), limits = levels(clean_embryo_length_longer$HHStage)) +
    theme_ridges(grid = TRUE, center_axis_labels = TRUE) +
    theme(legend.position = "none") +
    labs(
      title = paste("Distribution of", my_measure, "by HH Stage"),
      x = paste(my_measure, "(mm)"),
      y = "HH Stage"
    )
  
  ridge_plots_all[[my_measure]] <- p
}


## Print all ridge plots
ridge_plots_all
#> $Time

#> 
#> $faceted_length

#> 
#> $`C-pPL`

#> 
#> $`C-PS`

#> 
#> $`C-N`

#> 
#> $`N-pPL`

#> 
#> $`N-PS`

#> 
#> $PSM

#> 
#> $`C-Seg`

#> 
#> $SEG

#> 
#> $`C-HF`

4. Functions to calculate overlap between HHStages

Goal: Calculate the pairwise overlap between measurement distributions.

Overview of the approach

The idea is to calculate the integral of the minimum of the two density functions over the range where both have nonzero density. This provides a straightforward measure of similarity between two distributions, and therefore an intuitive way to quantify the overlap between the ridge plots (or kernel density estimates).

Simple Overlap Calculation

  1. Estimate Kernel Density: Compute the density estimates for both distributions using the same grid of points.
  2. Find the Intersection: At each grid point, take the minimum value of the two density functions.
  3. Integrate the Intersection: Sum (or integrate) these minimum density values across the grid to obtain the overlap score.

Mathematically:

Overlap = ∫ min(f1(x), f2(x)) dx

where f1(x) and f2(x) are the density functions of the two distributions.

Why This Works

  1. The approach is scale-invariant (not affected by total number of observations).
  2. It gives an intuitive measure where 0 means no overlap and 1 means complete overlap (if densities are normalized).
  3. It does not require complex statistics, just basic density estimation and summation.
#
# Function to compute overlap between two distributions
#
calculate_overlap <- function(x1, x2, bw = "SJ", n = 512) {
  d1 <- density(x1, bw = bw, n = n)
  d2 <- density(x2, bw = bw, n = n)
  
  # Find common range where densities are nonzero
  lower_bound <- max(min(d1$x), min(d2$x))
  upper_bound <- min(max(d1$x), max(d2$x))
  
  # If there is no overlap, return 0
  if (lower_bound >= upper_bound) return(0)
  
  # Define the function for the minimum of two densities
  min_density_function <- function(x) {
    f1 <- approx(d1$x, d1$y, xout = x, rule = 2)$y
    f2 <- approx(d2$x, d2$y, xout = x, rule = 2)$y
    pmin(f1, f2)
  }
  
  # Compute the integral over the overlapping range
  overlap_value <- integrate(min_density_function, lower = lower_bound, upper = upper_bound)$value
  
  return(overlap_value)
}

#
# Compute pairwise overlap matrix between embryo measurements
#
calculate_pairwise_overlap_matrix <- function(data_list, bw = "SJ", n = 512) {
  names_list <- str_sort(names(data_list), numeric = TRUE)
  if (is.null(names_list)) names_list <- paste0("D", seq_along(data_list))
  
  n_datasets <- length(data_list)
  overlap_matrix <- matrix(0, nrow = n_datasets, ncol = n_datasets,
                           dimnames = list(names_list, names_list))
  
  for (i in seq_len(n_datasets)) {
    for (j in seq_len(n_datasets)) {
      if (i <= j) {
        overlap_matrix[i, j] <- calculate_overlap(data_list[[i]], data_list[[j]], bw = bw, n = n)
        overlap_matrix[j, i] <- overlap_matrix[i, j] # Symmetric
      }
    }
  }
  
  return(overlap_matrix)
}

#
# Plot heatmap of overlap matrix
#
plot_overlap <- function(overlap_matrix, my_name, measurements = NULL) {
  names_list <- colnames(overlap_matrix)
  
  # Convert matrix to long format
  overlap_df <- reshape2::melt(overlap_matrix)
  colnames(overlap_df) <- c("Distribution1", "Distribution2", "Overlap")
  
  # Heatmap plot
  heatmap_plot <- overlap_df |>
    dplyr::filter(as.integer(Distribution2) <= as.integer(Distribution1)) |>
    ggplot2::ggplot(aes(x = Distribution1, y = Distribution2, fill = Overlap)) +
    ggplot2::geom_tile() +
    ggplot2::geom_text(aes(label = round(Overlap, 3)), color = "grey25", size = 5) +
    ggplot2::scale_fill_gradient(low = "white", high = "salmon") +
    ggplot2::theme_minimal() +
    ggplot2::labs(
      title = if (!is.null(measurements) && my_name %in% measurements$acronym) {
        paste(measurements[which(measurements$acronym == my_name), "my_labs", drop = TRUE],
              "| Density Overlap")
      } else {
        paste(my_name, "| Density Overlap")
      },
      fill = "Overlap"
    ) +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
  
  return(heatmap_plot)
}

5. Overlap between HH Stages | Time and Length


# Format the input data as list 
clean_measures_list <- clean_embryo_length |>
  select(-Embryo) |>
  pivot_longer(-HHStage, names_to = "Med", values_to = "Length") |>
  drop_na() |>
  group_by(Med, HHStage) |>
  summarise(Length = list(Length), .groups = "drop") |>
  nest(Length = c(HHStage, Length)) |>
  mutate(Length = map(Length, ~set_names(.x$Length, .x$HHStage))) |>
  deframe()


# Calculate the distribution overlap for all measurements
overlap_matrix <- setNames(
  lapply(clean_measures_list, calculate_pairwise_overlap_matrix),
  names(clean_measures_list)
)

# Generate overlap heatmaps for all measurements
overlap_heatmaps <- lapply(names(overlap_matrix), function(my_name) {
  plot_overlap(overlap_matrix[[my_name]], my_name = my_name)
})

# Assign names to heatmaps list for clarity
names(overlap_heatmaps) <- names(overlap_matrix)

# Combine each heatmap with its corresponding ridge plot into a 2-column layout
combined_overlap_ridge <- purrr::imap(overlap_heatmaps, function(heatmap, name) {
  ridge <- ridge_plots_all[[name]]
  patchwork::wrap_plots(ridge, heatmap, ncol = 2)
})


# Print combined plots
combined_overlap_ridge
#> $`C-HF`

#> 
#> $`C-N`

#> 
#> $`C-PS`

#> 
#> $`C-Seg`

#> 
#> $`C-pPL`

#> 
#> $`N-PS`

#> 
#> $`N-pPL`

#> 
#> $PSM

#> 
#> $SEG

#> 
#> $Time

6. Session info and save RData


### Save the RData file ## Uncomment if needed
# save.image(file = here::here("chick_elong_overlap/data_embryo_elongation/embryo_elongation_analysis.RData"))

# Session info
sessionInfo()
#> R version 4.5.0 (2025-04-11)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Europe/Lisbon
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] patchwork_1.3.0 reshape2_1.4.4  ggridges_0.5.6  here_1.0.1     
#>  [5] lubridate_1.9.4 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
#>  [9] purrr_1.0.4     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
#> [13] ggplot2_3.5.2   tidyverse_2.0.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] sass_0.4.10        generics_0.1.4     stringi_1.8.7      hms_1.1.3         
#>  [5] digest_0.6.37      magrittr_2.0.3     evaluate_1.0.3     grid_4.5.0        
#>  [9] timechange_0.3.0   RColorBrewer_1.1-3 fastmap_1.2.0      rprojroot_2.0.4   
#> [13] plyr_1.8.9         jsonlite_2.0.0     scales_1.4.0       jquerylib_0.1.4   
#> [17] cli_3.6.5          rlang_1.1.6        crayon_1.5.3       bit64_4.6.0-1     
#> [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.10        tools_4.5.0       
#> [25] parallel_4.5.0     tzdb_0.5.0         vctrs_0.6.5        R6_2.6.1          
#> [29] lifecycle_1.0.4    bit_4.6.0          vroom_1.6.5        pkgconfig_2.0.3   
#> [33] pillar_1.10.2      bslib_0.9.0        gtable_0.3.6       glue_1.8.0        
#> [37] Rcpp_1.0.14        xfun_0.52          tidyselect_1.2.1   rstudioapi_0.17.1 
#> [41] knitr_1.50         farver_2.1.2       htmltools_0.5.8.1  labeling_0.4.3    
#> [45] rmarkdown_2.29     compiler_4.5.0